
rm(list=ls())
## load tsv data table

require(data.table)
data<- data.frame(fread("Al-Mutawa/meta_541_original.tsv"))

## 1.  remove metabolites with unknown chebi ID
data_chebi = data[sapply(data[,1], function(x) grepl("CHEBI", x) ),]

## 2.  remove columns with NAs
data_nonna = data_chebi[,!apply(is.na(data_chebi), 2, any)]

## 3. remove metabolites with >=50% zeros
data_metab = apply(data_nonna[,13:84], c(1,2), as.numeric)
metab_delete = rowMeans(data_metab==0) >= 0.5
data_541 = data_nonna[!metab_delete,]


## 4. save the clean data
library(xlsx)
write.xlsx(data_541,"Al-Mutawa/meta_541.xlsx")

## save chebi IDs for generating pathways
chebi_541 = data_541$database_identifier
write.table(chebi_541, file = "Al-Mutawa/chebi_541.txt",row.names = F,col.names = F, quote = FALSE)

## 5. save the data matrix ## choose tumour data only 
data_matrix = t(data_metab[!metab_delete,])[29:72,]
rownames(data_matrix)
colnames(data_matrix) = data_541$pattern_ID



## 6. normalized the data
hist(data_matrix)
if(min(data_matrix)>0){
  data_matrix_normal = log2(data_matrix)
  hist(data_matrix_normal)
} else{
  a = 1e+8 ## default value, could be other values 
  data_matrix_normal = log2((data_matrix + sqrt(data_matrix^2 + a^2))/2)
  hist(data_matrix_normal)
}

y = c(rep(1, 19),rep(0,25)) ## the first 19 samples are treated with high-oxygen level
d541 = cbind(data_matrix_normal, y  )
save(d541, file="Al-Mutawa/d541.RData")




############################   ========================= smpdb_HMDB_pathway =========================   ############################ 

smpdbhmdb <- data.frame(fread("Al-Mutawa/pathwaydatabase541/mbrole2_smpdbhmdb.tsv"))
smpdbhmdbpath_chebi = smpdbhmdb$Submitted.IDs
names(smpdbhmdbpath_chebi) = smpdbhmdb$ID.Annotation
listsmpdbhmdbpath = lapply(smpdbhmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listsmpdbhmdbpath_uniq = listsmpdbhmdbpath[which(!duplicated(listsmpdbhmdbpath))]

## map cheiID to metabolites identifies
smpdbhmdbpath <- lapply(listsmpdbhmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_541$pattern_ID[which(chebi_541 %in% x )] )

smpdbhmdbpath_metab = smpdbhmdbpath[which(!duplicated(smpdbhmdbpath))]

save(smpdbhmdbpath_metab, file="Al-Mutawa/pathwaydatabase541/smpdbhmdbpath_metab_541.RData")




############################   ========================= Biocycpathway =========================   ############################ 

biocyc <- data.frame(fread("Al-Mutawa/pathwaydatabase541/mbrole2_biocyc.tsv"))
biocycpath_chebi = biocyc$Submitted.IDs
names(biocycpath_chebi) = biocyc$ID.Annotation
listbiocycpath = lapply(biocycpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listbiocycpath_uniq = listbiocycpath[which(!duplicated(listbiocycpath))]

## map cheiID to metabolites identifies
biocycpath <- lapply(listbiocycpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_541$pattern_ID[which(chebi_541 %in% x )] )

biocycpath_metab = biocycpath[which(!duplicated(biocycpath))]

save(biocycpath_metab, file="Al-Mutawa/pathwaydatabase541/biocycpath_metab_541.RData")



############################   ========================= keggpathway =========================   ############################ 

kegg<- data.frame(fread("Al-Mutawa/pathwaydatabase541/mbrole2_kegg.tsv"))
keggpath_chebi = kegg$Submitted.IDs
names(keggpath_chebi) = kegg$ID.Annotation
listkeggpath = lapply(keggpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listkeggpath_uniq = listkeggpath[which(!duplicated(listkeggpath))]

## map cheiID to metabolites identifies
keggpath <- lapply(listkeggpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_541$pattern_ID[which(chebi_541 %in% x )] )

keggpath_metab = keggpath[which(!duplicated(keggpath))]

save(keggpath_metab, file="Al-Mutawa/pathwaydatabase541/keggpath_metab_541.RData")





############################   ========================= wikipathway =========================   ############################ 
library(rWikiPathways)
listMetab <- NULL
Pathways<-listPathwayIds(organism="Homo sapiens")
wikipath_info<-data.frame(id=1:length(Pathways))
wikipath_info$names<-Pathways
#list of chebi IDs per pathway
listMetab<-apply(as.matrix(Pathways),1, function(x) getXrefList(pathway = x, systemCode="Ce"))

#remove chebiID character
listMetab<-lapply(listMetab, function(x) gsub('^\\CHEBI:|\\$', '', x))
listMetab<-lapply(listMetab, function(x) unique(x))
wikipath_info$chebi_path = listMetab
wikipath_info$path_length<-sapply(listMetab, function(x) length(x))


#map chebi IDs to  your own metabolite names
wikiMetab<-lapply(listMetab, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_541$pattern_ID[which(sapply(chebi_541, function(i) gsub('^\\CHEBI:|\\$', '', i)) %in% x)])  

# remove empty pathways
wikipath = wikiMetab[sapply(wikiMetab,length)>0]
wikipath_uniq = wikipath[which(!duplicated(wikipath))]
save(wikipath_uniq, file="Al-Mutawa/wikipath_metab_541.RData")



############################   ========================= proteinhmdb =========================   ############################ 

proteinhmdb<- data.frame(fread("Al-Mutawa/pathwaydatabase541/mbrole2_proteinhmdb.tsv"))
proteinhmdbpath_chebi = proteinhmdb$Submitted.IDs
names(proteinhmdbpath_chebi) = proteinhmdb$ID.Annotation
listproteinhmdbpath = lapply(proteinhmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listproteinhmdbpath_uniq = listproteinhmdbpath[which(!duplicated(listproteinhmdbpath))]

## map cheiID to metabolites identifies
proteinhmdbpath <- lapply(listproteinhmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_541$pattern_ID[which(chebi_541 %in% x )] )

proteinhmdbpath_metab = proteinhmdbpath[which(!duplicated(proteinhmdbpath))]

save(proteinhmdbpath_metab, file="Al-Mutawa/pathwaydatabase541/proteinhmdbpath_metab_541.RData")






# ############################   ========================= smpdb_ECMDB_pathway =========================   ############################ 
# 
# smpdbecmdb <- data.frame(fread("Al-Mutawa/mbrole2_smpdbecmdb.tsv"))
# smpdbecmdbpath_chebi = smpdbecmdb$Submitted.IDs
# names(smpdbecmdbpath_chebi) = smpdbecmdb$ID.Annotation
# listsmpdbecmdbpath = lapply(smpdbecmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listsmpdbecmdbpath_uniq = listsmpdbecmdbpath[which(!duplicated(listsmpdbecmdbpath))]
# 
# ## map cheiID to metabolites identifies
# smpdbecmdbpath <- lapply(listsmpdbecmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_541$pattern_ID[which(chebi_541 %in% x )] )
# 
# smpdbecmdbpath_metab = smpdbecmdbpath[which(!duplicated(smpdbecmdbpath))]
# 
# save(smpdbecmdbpath_metab, file="Al-Mutawa/smpdbecmdbpath_metab_541.RData")
# 
# 
# 
# 
# ############################   ========================= smpdb_YMDB_pathway =========================   ############################ 
# 
# smpdbymdb <- data.frame(fread("Al-Mutawa/mbrole2_smpdbymdb.tsv"))
# smpdbymdbpath_chebi = smpdbymdb$Submitted.IDs
# names(smpdbymdbpath_chebi) = smpdbymdb$ID.Annotation
# listsmpdbymdbpath = lapply(smpdbymdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listsmpdbymdbpath_uniq = listsmpdbymdbpath[which(!duplicated(listsmpdbymdbpath))]
# 
# ## map cheiID to metabolites identifies
# smpdbymdbpath <- lapply(listsmpdbymdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_541$pattern_ID[which(chebi_541 %in% x )] )
# 
# smpdbymdbpath_metab = smpdbymdbpath[which(!duplicated(smpdbymdbpath))]
# 
# save(smpdbymdbpath_metab, file="Al-Mutawa/smpdbymdbpath_metab_541.RData")




# ############################   ========================= biofunction =========================   ############################ 
# 
# biofunction<- data.frame(fread("Al-Mutawa/pathwaydatabase541/mbrole2_biofunction.tsv"))
# biofunctionpath_chebi = biofunction$Submitted.IDs
# names(biofunctionpath_chebi) = biofunction$ID.Annotation
# listbiofunctionpath = lapply(biofunctionpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listbiofunctionpath_uniq = listbiofunctionpath[which(!duplicated(listbiofunctionpath))]
# 
# ## map cheiID to metabolites identifies
# biofunctionpath <- lapply(listbiofunctionpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_541$pattern_ID[which(chebi_541 %in% x )] )
# 
# biofunctionpath_metab = biofunctionpath[which(!duplicated(biofunctionpath))]
# 
# save(biofunctionpath_metab, file="Al-Mutawa/pathwaydatabase541/biofunctionpath_metab_541.RData")
# 
# 
# ############################   ========================= chebirole =========================   ############################ 
# 
# chebirole<- data.frame(fread("Al-Mutawa/pathwaydatabase541/mbrole2_chebirole.tsv"))
# chebirolepath_chebi = chebirole$Submitted.IDs
# names(chebirolepath_chebi) = chebirole$ID.Annotation
# listchebirolepath = lapply(chebirolepath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listchebirolepath_uniq = listchebirolepath[which(!duplicated(listchebirolepath))]
# 
# ## map cheiID to metabolites identifies
# chebirolepath <- lapply(listchebirolepath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_541$pattern_ID[which(chebi_541 %in% x )] )
# 
# chebirolepath_metab = chebirolepath[which(!duplicated(chebirolepath))]
# 
# save(chebirolepath_metab, file="Al-Mutawa/pathwaydatabase541/chebirolepath_metab_541.RData")
# 



